# Call libraries if not already loaded
require(tidyverse)
require(glue)

# Quality filter ----
# Rename the raw duration column to `dur` and rescale it by 1/60 (seconds to
# minutes, going by the raw column name), then keep only rows with gc == 1
# whose qualtricsCountry is "United States".
# NOTE(review): `gc` is presumably the panel's "good complete" flag -- confirm.
g <- g %>%
  rename(dur = Duration..in.seconds.) %>%
  mutate(dur = as.numeric(dur) / 60) %>%
  filter(gc == 1, qualtricsCountry == "United States")

# Report the size of the retained sample and its median completion time.
message(glue("Number of respondents: {nrow(g)}"))
message(glue("Median duration: {median(g$dur)} minutes"))

# Covariates----
# Race indicators, coded 1/0. `black`, `asian` and `otherrace` are substring
# matches on Race, so a response containing several labels can set more than
# one of them, and a missing Race yields 0.
# NOTE(review): `white` is an exact match on "White", so a response combining
# White with another label is coded 0 and a missing Race stays NA -- confirm
# that this asymmetry is intended.
g <- g %>%
  mutate(
    black = as.numeric(grepl("Black or African American", Race)),
    asian = as.numeric(grepl("Asian", Race)),
    white = as.numeric(Race == "White"),
    otherrace = as.numeric(grepl("Native|Other", Race))
  )

# Hispanic indicator: rename the raw column, then code "Yes" as 1 and any other
# non-missing answer as 0.
g <- g %>%
  rename(hispanic = Hispanic) %>%
  mutate(hispanic = as.numeric(hispanic == "Yes"))

# Region ----
# Four-category region derived from State. A State value that appears in none
# of the lists below is left as NA.
# NOTE(review): "District of Columbia" is assigned to "Northeast" here, while
# the U.S. Census Bureau places DC in the South -- confirm this is deliberate.
g$region <- case_when(
  g$State %in% c(
    "Illinois", "Indiana", "Iowa", "Kansas", "Michigan", "Minnesota",
    "Missouri", "Nebraska", "North Dakota", "Ohio", "South Dakota",
    "Wisconsin"
  ) ~ "Midwest",
  g$State %in% c(
    "Connecticut", "Maine", "Massachusetts", "New Hampshire", "Rhode Island",
    "Vermont", "New Jersey", "New York", "Pennsylvania",
    "District of Columbia"
  ) ~ "Northeast",
  g$State %in% c(
    "Delaware", "Florida", "Georgia", "Maryland", "North Carolina",
    "South Carolina", "Virginia", "West Virginia", "Alabama", "Kentucky",
    "Mississippi", "Tennessee", "Arkansas", "Louisiana", "Oklahoma", "Texas"
  ) ~ "South",
  g$State %in% c(
    "Arizona", "Colorado", "Idaho", "Montana", "Nevada", "New Mexico", "Utah",
    "Wyoming", "Alaska", "California", "Hawaii", "Oregon", "Washington"
  ) ~ "West"
)

# Store both the derived region and the original State as factors.
g$region <- factor(g$region)
g$State <- factor(g$State)

# Gender and education ----
# female:  1 when Gender is "Female", 0 for any other non-missing answer.
# college: 1 when edu contains "Advanced" or "Bach", otherwise 0.
# edu3:    three-level education factor with "High School or less" as the
#          reference level; edu values outside the lists below become NA.
# edu_*:   1/0 dummies for each edu3 level (NA where edu3 is NA).
g <- g %>%
  mutate(
    female = as.numeric(Gender == "Female"),
    college = as.numeric(grepl("Advanced|Bach", edu)),
    edu3 = case_when(
      edu %in% c(
        "No High School",
        "Some High School",
        "High School Diploma or GED"
      ) ~ "High School or less",
      edu %in% c(
        "Some college course work but non-degree or certificate",
        "Technical Certificate",
        "Associate Degree"
      ) ~ "Some College",
      edu %in% c(
        "Bachelor's Degree",
        "Advanced degree (post college, such as JD or MBA)"
      ) ~ "BA or higher"
    ),
    edu3 = relevel(factor(edu3), ref = "High School or less"),
    edu_hsless = as.numeric(edu3 == "High School or less"),
    edu_somecol = as.numeric(edu3 == "Some College"),
    edu_college = as.numeric(edu3 == "BA or higher")
  )

# Age ----
# Age is the difference between a fixed reference year and the reported birth
# year; update base_year if the reference year changes.
base_year <- 2024
g[["age"]] <- base_year - as.numeric(g[["YearBorn"]])

# Ideology ----
# ideo5: Ideo as a factor with levels running from "Very conservative" to
#        "Very liberal", followed by "Not sure".
g$ideo5 <- factor(
  g$Ideo,
  levels = c(
    "Very conservative", "Conservative", "Moderate",
    "Liberal", "Very liberal", "Not sure"
  )
)

# ideo3: collapses the two conservative and the two liberal answers; "Moderate"
#        and "Not sure" are kept as their own categories, anything else is NA.
g$ideo3 <- case_when(
  g$Ideo %in% c("Conservative", "Very conservative") ~ "Conservative",
  g$Ideo %in% c("Liberal", "Very liberal") ~ "Liberal",
  g$Ideo %in% c("Moderate") ~ "Moderate",
  g$Ideo %in% c("Not sure") ~ "Not sure"
)

# 1/0 dummies for the liberal and conservative groups (NA where ideo3 is NA).
g$lib <- as.numeric(g$ideo3 == "Liberal")
g$con <- as.numeric(g$ideo3 == "Conservative")

# Convert ideo3 to a factor with "Moderate" as the reference level.
g$ideo3 <- relevel(factor(g$ideo3), ref = "Moderate")

# Party identification ----
# pid7:       seven-point party label assembled from the DemStrength,
#             RepStrength and IndepLean items, checked in that order.
# pid7_scale: numeric version, 1 = Strong Democrat ... 7 = Strong Republican;
#             "Not sure" is placed at the midpoint (4) with "Independent".
# pid3:       leaners grouped with their party (1-3 / 4 / 5-7).
# pid_block:  leaners grouped with independents (1-2 / 3-5 / 6-7).
# rep, dem:   1/0 dummies based on pid3 (NA where pid3 is NA).
#
# Some IndepLean labels arrive with a trailing tab. The comparison is made on
# trimws(IndepLean) so that labels match with or without stray whitespace;
# otherwise a label lacking the tab would silently fall through to NA.
g <- g %>%
  mutate(
    pid7 = case_when(
      DemStrength == "Strong Democrat" ~ "Strong Democrat",
      DemStrength == "Not so strong Democrat" ~ "Not so strong Democrat",
      RepStrength == "Strong Republican" ~ "Strong Republican",
      RepStrength == "Not so strong Republican" ~ "Not so strong Republican",
      trimws(IndepLean) == "The Democratic Party" ~ "Lean Democrat",
      trimws(IndepLean) == "The Republican Party" ~ "Lean Republican",
      trimws(IndepLean) == "Neither" ~ "Independent",
      trimws(IndepLean) == "Not sure" ~ "Not sure"
    ),
    pid7_scale = case_when(
      pid7 == "Strong Democrat" ~ 1,
      pid7 == "Not so strong Democrat" ~ 2,
      pid7 == "Lean Democrat" ~ 3,
      pid7 == "Independent" ~ 4,
      pid7 == "Not sure" ~ 4,
      pid7 == "Lean Republican" ~ 5,
      pid7 == "Not so strong Republican" ~ 6,
      pid7 == "Strong Republican" ~ 7
    ),
    pid3 = case_when(
      pid7_scale %in% c(1, 2, 3) ~ "Democrat",
      pid7_scale == 4 ~ "Neither",
      pid7_scale %in% c(5, 6, 7) ~ "Republican"
    ),
    pid_block = case_when(
      pid7_scale %in% c(1, 2) ~ "Democrat",
      pid7_scale %in% c(3, 4, 5) ~ "Neither",
      pid7_scale %in% c(6, 7) ~ "Republican"
    ),
    rep = ifelse(pid3 == "Republican", 1, 0),
    dem = ifelse(pid3 == "Democrat", 1, 0)
  )

# Both three-way groupings become factors with "Democrat" as the reference.
g$pid3 <- relevel(factor(g$pid3), ref = "Democrat")
g$pid_block <- relevel(factor(g$pid_block), ref = "Democrat")

# Income processing with median imputation ----
# Ordered income brackets, lowest to highest, exactly as they appear in Income.
income_levels <- c(
  "Less than $10,000", "$10,000 - $19,999", "$20,000 - $29,999",
  "$30,000 - $39,999", "$40,000 - $49,999", "$50,000 - $59,999",
  "$60,000 - $69,999", "$70,000 - $79,999", "$80,000 - $99,999",
  "$100,000 - $119,999", "$120,000 - $149,999", "$150,000 - $199,999",
  "$200,000 - $249,999", "$250,000 - $349,999", "$350,000 - $499,999",
  "$500,000 or more"
)

# Treat "Prefer not to say" as missing. Income is coerced to character first:
# ifelse() applied to a factor returns its integer codes, which would then all
# fail to match income_levels and turn every response into NA.
g$income_imp <- as.character(g$Income)
g$income_imp[g$income_imp %in% "Prefer not to say"] <- NA
g$income_imp <- factor(g$income_imp, ordered = TRUE, levels = income_levels)

# Impute missing income with the median bracket. With an even number of
# observed values the median position can fall halfway between two brackets
# (x.5); floor() makes the choice of the lower bracket explicit.
median_position <- median(as.integer(g$income_imp), na.rm = TRUE)
median_level <- income_levels[floor(median_position)]
if (is.na(median_level)) {
  warning(
    "No non-missing income values; income_imp cannot be imputed.",
    call. = FALSE
  )
}
message(glue("Imputing missing income values with: {median_level}"))
g$income_imp[is.na(g$income_imp)] <- median_level

g <- g %>%
  mutate(
    # Income quintiles; cutoffs approximate the household income percentiles in
    # https://www.census.gov/content/dam/Census/library/publications/2023/demo/p60-279.pdf
    income5 = case_when(
      # 30,000 = 20th percentile
      income_imp %in% c(
        "Less than $10,000", "$10,000 - $19,999", "$20,000 - $29,999"
      ) ~ "Q1",
      # 58,020 = 40th percentile
      income_imp %in% c(
        "$30,000 - $39,999", "$40,000 - $49,999", "$50,000 - $59,999"
      ) ~ "Q2",
      # 94,000 = 60th percentile
      income_imp %in% c(
        "$60,000 - $69,999", "$70,000 - $79,999", "$80,000 - $99,999"
      ) ~ "Q3",
      # 153,000 = 80th percentile
      income_imp %in% c("$100,000 - $119,999", "$120,000 - $149,999") ~ "Q4",
      income_imp %in% c(
        "$150,000 - $199,999", "$200,000 - $249,999", "$250,000 - $349,999",
        "$350,000 - $499,999", "$500,000 or more"
      ) ~ "Q5"
    ),
    # Binary split at bracket 8 ("$70,000 - $79,999"), the bracket containing
    # the 2022 real median household income of $74,580 (2021: $76,330); see
    # https://www.census.gov/library/publications/2023/demo/p60-279.html
    # (Figure 1 and Table A-1).
    income_bin = ifelse(as.numeric(income_imp) >= 8, 1, 0)
  )

# Quintile factor with Q1 as the reference level, plus one 1/0 dummy each.
g$income5 <- relevel(factor(g$income5), ref = "Q1")
g$incomeq1 <- ifelse(g$income5 == "Q1", 1, 0)
g$incomeq2 <- ifelse(g$income5 == "Q2", 1, 0)
g$incomeq3 <- ifelse(g$income5 == "Q3", 1, 0)
g$incomeq4 <- ifelse(g$income5 == "Q4", 1, 0)
g$incomeq5 <- ifelse(g$income5 == "Q5", 1, 0)

# Employment ----
# employ_bin: 1 for full-time or part-time employment, otherwise 0.
# worker:     as employ_bin, but also 1 for those not employed and looking for
#             work.
g <- g %>%
  mutate(
    employ_bin = as.numeric(
      employ %in% c("Employed full-time", "Employed part-time")
    ),
    worker = as.numeric(
      employ %in% c(
        "Employed full-time",
        "Employed part-time",
        "Not employed, but looking for work"
      )
    )
  )

# Quota variables ----
# Age bands, stored as a factor; ages below 18 or missing fall in no band and
# stay NA.
g$age_quota <- factor(
  case_when(
    g$age >= 18 & g$age <= 34 ~ "18-34",
    g$age >= 35 & g$age <= 54 ~ "35-54",
    g$age >= 55 ~ "55+"
  )
)

# Income bands built from the raw (un-imputed) Income item. Every value not
# listed below, including missing values, falls into the final catch-all
# category.
g$income_quota <- case_when(
  g$Income %in% c(
    "Less than $10,000", "$10,000 - $19,999", "$20,000 - $29,999",
    "$30,000 - $39,999", "$40,000 - $49,999"
  ) ~ "less_50k",
  g$Income %in% c(
    "$50,000 - $59,999", "$60,000 - $69,999", "$70,000 - $79,999",
    "$80,000 - $99,999"
  ) ~ "50k-99k",
  g$Income %in% c(
    "$100,000 - $119,999", "$120,000 - $149,999", "$150,000 - $199,999",
    "$200,000 - $249,999", "$250,000 - $349,999", "$350,000 - $499,999",
    "$500,000 or more"
  ) ~ "100k+",
  TRUE ~ "Prefer not say"
)